# Split each module x year across store deciles
# JHL

# Packages 
list.of.packages <- c("folderfun", "data.table", "bit64", "lubridate", "foreign", "dplyr", "ggplot2")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages,repos = "http://cran.us.r-project.org")

require(folderfun)
require(data.table)
require(dplyr)
require(foreign)
require(ggplot2)
require(lubridate)
require(bit64) # Read long integers
setNumericRounding(0)

# Setup paths
setff("path_home")
path_home <- ffpath_home()

dta_path <- sprintf("%s/raw/nielsen",path_home)
save_dir <- paste0(path_home,"/dta/nielsen/sqmy_posted_module/%s_%s_%s.rds")
do_path <- sprintf("%s/do/data_prep/01_price_index",path_home)
repository <- sprintf("%s/dta/nielsen/my_posted_module",path_home)

# Open necessary files: module and store info
mod_gp <- data.table(read.dta(sprintf("%s/Product_Hierarchy_1.22.2016.dta",dta_path)))
mod_gp <- mod_gp[,.(product_module_code,product_group_code,department_code)]
stores_0615 <- data.table(read.dta(sprintf("%s/stores_0615.dta",dta_path)))
stores_0615 <- stores_0615[,.(store_code_uc,year)] # ,fips_state_code,fips_county_code
setnames(stores_0615,"year","panel_year")

# Create (10) quantiles by store 
store_codes <- data.table(unique(stores_0615[,store_code_uc]))
setnames(store_codes,"V1","store_code_uc")
store_codes$decile <- ntile(store_codes$store_code_uc,10)
setkey(store_codes,store_code_uc)
setkey(stores_0615,store_code_uc)
stores_0615 <- store_codes[stores_0615, nomatch=0]

# File table for all tsv files in posted modules
if(!file.exists(sprintf("%s/my_posted_module_0615_files.RData",do_path))){

		setwd(repository)
		
		file.table <- system("ls -S", intern = TRUE)
		
		# Extract name of files, match with module and group codes
		file.table <- data.table(file.table)
		setnames(file.table,"file.table","file_name") 
		file.table[,c("V1","V2"):=tstrsplit(file_name,"_",fixed=T)]
		file.table[,c("V3","V4"):=tstrsplit(V2,".",fixed=T)]
		setnames(file.table,"V1","product_module_code")
		setnames(file.table,"V3","year")
		file.table[,':='(file_name=paste(repository,"/",file_name,sep=""),V2=NULL,V4=NULL,product_module_code=as.numeric(product_module_code))]
		set.seed(10)
		file.table[,':='(size_id=.I,id=sample(1:nrow(file.table), nrow(file.table), replace=F))]

		setkey(mod_gp, product_module_code)
		setkey(file.table,product_module_code)
		
		file.table <- mod_gp[file.table,nomatch=0]		
		# 8621 changed from 5507 to 5516 in 2014 
		file.table <- file.table[!(product_module_code==8621&product_group_code==5507&year>=2014)]
		file.table <- file.table[!(product_module_code==8621&product_group_code==5516&year<2014)]
		
		save(file.table, file = sprintf("%s/my_posted_module_0615_files.RData",do_path)) # Save to RData for quick loading 		 
}
	
load(sprintf("%s/my_posted_module_0615_files.RData",do_path))

# 1:100
myID <- as.numeric(Sys.getenv("SLURM_ARRAY_TASK_ID"))

# Quantiles by random id
setkey(file.table,id)
quant = quantile(unique(file.table$id),seq(0,1,by=0.01))
file.table = file.table[id>=quant[[myID]] & id<=quant[[myID+1]],]

# Split each module x year across store quantiles (10)
for(i in file.table[,id]){
	data <- readRDS(file.table[id==i,file_name])
	
	# Merge with store deciles
	setkey(stores_0615,store_code_uc,panel_year)
	setkey(data,store_code_uc,panel_year)
	data <- stores_0615[data,nomatch=0]
	
	module <-unique(file.table[id==i,product_module_code])
	panel_year <- unique(data[,panel_year])

	lapply(unique(data[,decile]),function(x){ saveRDS(data[decile==x],sprintf(save_dir,x,module,panel_year)) })
	cat(sprintf('Module %s Year %s Done ',module,panel_year))
	gc()
}
